In [1]:
# Convenience module pulling in the usual scientific Python stack
# (provides, among others, math, numpy as np and pandas as pd used below)
from everything import *
In [2]:
from brede.data.neurosynth import NeurosynthDatabase
from brede.data.pubmed import Pubmed
from brede.data.words import CognitiveWords
from brede.core.matrix import Matrix
from brede.data.sbs2 import SBS2Data
In [3]:
# Log to logfile named 'brede.log'
import logging
logger = logging.getLogger()
file_handler = logging.FileHandler(filename='brede.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)
In [4]:
pubmed = Pubmed()
In [5]:
# Load Neurosynth
nd = NeurosynthDatabase()
nd_database = nd.database()
In [6]:
# Get abstracts for Neurosynth papers from PubMed
# This will take several hours
medlines = pubmed.get_medlines(set(nd_database.id))
In [7]:
# Find keywords in abstracts and add them to a list of list of words
cognitive_words = CognitiveWords()
corpus = []
for n, medline in enumerate(medlines):
    abstract = medline.get('AB', '').lower()
    keywords = cognitive_words.find_all(abstract)
    corpus.append(keywords)
    logger.debug(('Iterating over medline abstracts '
                  'for keyword extraction: {}').format(n))
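For intuition, CognitiveWords.find_all scans a text for occurrences of known cognitive phrases. A minimal stand-in with a hypothetical three-phrase vocabulary (not the actual CognitiveWords list) behaves roughly like this:

# Hypothetical illustration, not the real CognitiveWords vocabulary:
# a regex matcher over a small phrase list, longest phrases first
import re

phrases = ['working memory', 'memory', 'attention']
pattern = re.compile('|'.join(re.escape(p) for p in
                              sorted(phrases, key=len, reverse=True)))
print(pattern.findall('effects of attention on working memory load'))
# ['attention', 'working memory']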
In [8]:
# Corpus-wide keywords
all_keywords = [word for wordlist in corpus for word in wordlist]
all_unique_keywords = set(all_keywords)
In [9]:
# Build bag-of-phrases matrix
bag_of_phrases = pd.DataFrame(index=[medline['PMID'] for medline in medlines],
                              columns=list(all_unique_keywords)).fillna(0)
for n, (medline, keywords) in enumerate(zip(medlines, corpus)):
    pmid = medline['PMID']
    for keyword in keywords:
        bag_of_phrases.loc[pmid, keyword] += 1
        # A multiword phrase also contributes fractionally to each of
        # its constituent words that is itself a keyword
        if ' ' in keyword:
            keyword_parts = keyword.split()
            for keyword_part in keyword_parts:
                if keyword_part in all_unique_keywords:
                    bag_of_phrases.loc[pmid, keyword_part] += 1.0 / len(keyword_parts)
    logger.debug(('Iterating over medline abstracts '
                  'for matrix construction: {}').format(n))
In [10]:
# Scale bag-of-phrases matrix with IDF
scaled = Matrix(bag_of_phrases).idf()
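Matrix.idf presumably downweights terms that occur in many abstracts. A standalone sketch, assuming the standard inverse-document-frequency formulation:

# Sketch of standard IDF scaling (an assumption about Matrix.idf):
# each count is multiplied by log(N / n_t), where N is the number of
# documents and n_t the number of documents containing term t
import numpy as np

counts = np.array([[2.0, 0.0], [1.0, 1.0], [0.0, 3.0]])
n_docs = counts.shape[0]
doc_freq = (counts > 0).sum(axis=0)
idf_scaled = counts * np.log(n_docs / doc_freq)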
In [11]:
# Read Smartphone Brain Scanner surface
sbs2_data = SBS2Data()
surface = sbs2_data.surface()
In [12]:
# Group reported activation coordinates by study and prepare an
# isotropic Gaussian kernel (sigma in millimeters)
grouped = nd_database[['id', 'x', 'y', 'z']].groupby('id')
v = np.zeros((len(grouped), surface.vertices.shape[0]))
sigma = 10
norm1 = 1.0 / (sigma * math.sqrt(2 * math.pi))
norm2 = -1.0 / (2 * sigma ** 2)
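Each study coordinate c then contributes norm1 * exp(norm2 * ||x - c||^2) to a vertex at position x, that is, an isotropic Gaussian with standard deviation sigma evaluated at the vertex-to-focus distance.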
In [13]:
# Cortexification of study coordinates: smear each study's peak
# coordinates onto the surface vertices with the Gaussian kernel
for n, (study_id, group) in enumerate(grouped):
    coords = group[['x', 'y', 'z']]
    p = 0
    for index, coord in coords.iterrows():
        p += norm1 * np.exp(norm2 * np.sum((surface.vertices - coord.values) ** 2, axis=1))
    # Normalize by the number of coordinates reported in the study
    p /= math.sqrt(len(coords))
    v[n, :] = p
    if not n % 100:
        logger.debug(('Iterating over studies '
                      'for computing Talairach coordinate load: {}').format(n))
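The double loop above can be collapsed with a pairwise distance computation; a sketch using scipy's cdist (an assumption: scipy is not imported above):

# Equivalent vectorized sketch: one squared distance per
# (focus, vertex) pair, then a sum over each study's foci
from scipy.spatial.distance import cdist

for n, (study_id, group) in enumerate(grouped):
    sq_dists = cdist(group[['x', 'y', 'z']].values, surface.vertices,
                     'sqeuclidean')
    v[n, :] = norm1 * np.exp(norm2 * sq_dists).sum(axis=0) / math.sqrt(len(group))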
In [14]:
# Combine: rows of v (studies, in group order) are assumed to align
# with rows of the scaled matrix (medlines); the product gives one
# keyword loading per surface vertex
product = v.T.dot(scaled)
product_matrix = Matrix(product, columns=bag_of_phrases.columns)
product_matrix.shape
Out[14]:
In [15]:
product_matrix.to_csv('neurosynth electrode word matrix.csv')